#!/usr/bin/env python 
# -*- coding:utf-8 -*-
'''
@File    :   data_ext.py    
@Modify Time      @Author    @Version    @Description
------------      -------    --------    -----------
2022/4/20 0020 17:22   st      1.0         None

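Preprocessing of exported literature metadata (originally written for
pneumonia data; the dataset actually processed is selected by `data_tag`).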
'''
import os

# Name of the dataset; used both as the sub-directory and as the file stem.
data_tag = '肺肿瘤'
file_dir = '../data/matadata/' + data_tag
matadata_path = os.path.join(file_dir, data_tag + '.txt')

# Regex forms of the field markers. The parser below does not use them (it
# matches the same markers with str.startswith); they are kept as
# module-level names.
pat_title = '^【标题】:(.+)'
pat_id = '^【流水号】:(.+)'
pat_abstract = '^【摘要】:(.+)'

# Marker at the start of a line -> key under which the rest of the line is
# stored in the record dict.
_FIELD_PREFIXES = (
    ('【标题】:', 'title'),
    ('【流水号】:', 'ls'),
    ('【摘要】:', 'zy'),
)


def _parse_records(lines):
    """Group the lines of a metadata export into one dict per record.

    A record begins at a line that consists solely of the next expected
    running number followed by a dot ("1.", "2.", ...). Inside a record,
    lines starting with one of the markers in ``_FIELD_PREFIXES`` contribute
    the text after the marker under the associated key; every other line is
    ignored.

    :param lines: iterable of text lines (e.g. an open text file).
    :return: list of dicts with whichever of the keys ``'title'``, ``'ls'``
        and ``'zy'`` were present for each record, in file order.
    """
    records = []
    current = {}
    index = 1
    for raw_line in lines:
        line = raw_line.strip()
        if line == str(index) + '.':
            # A new record header closes the record collected so far.
            if current:
                records.append(current)
                current = {}
            index += 1
            continue
        for prefix, key in _FIELD_PREFIXES:
            if line.startswith(prefix):
                # len(prefix) instead of a hand-counted slice offset.
                current[key] = line[len(prefix):]
                break
    # The last record is not followed by another "N." header, so it has to be
    # flushed explicitly; otherwise it would be silently lost.
    if current:
        records.append(current)
    return records


def _format_record(record):
    """Render one record as two tab-separated lines: id/title and id/abstract.

    :param record: dict with the keys ``'ls'``, ``'title'`` and ``'zy'``.
    :return: ``"<ls>\\t<title>\\n<ls>\\t<zy>"``.
    :raises KeyError: if the record lacks one of the three fields.
    """
    serial = record['ls']
    return serial + '\t' + record['title'] + '\n' + serial + '\t' + record['zy']


# Read the whole export first; the context manager closes the input handle.
with open(matadata_path, 'r', encoding='utf-8') as src:
    records = _parse_records(src)

# Format before opening the output so that an incomplete record (KeyError)
# does not leave behind a truncated, empty output file.
datas = [_format_record(record) for record in records]

# Mode 'w' already truncates the file, so no explicit truncate() is needed.
with open(os.path.join(file_dir, 'matadata_' + data_tag + '.txt'), 'w', encoding='utf-8') as f:
    f.write('\n'.join(datas))

